Guided analysis

library(tidyverse)
library(ggplot2)
library(plotly)

Read in the gapminder_clean.csv data with read.csv and convert it to a tibble

# Load the cleaned gapminder table. Base read.csv() is used on purpose: it
# converts spaces and punctuation in the header to dots, and the renaming
# step further down matches on those dotted names.
data <- as_tibble(read.csv("gapminder_clean.csv"))

Rename some columns to avoid repeating long, unwieldy names

# Short aliases for the verbose indicator columns (new name = old name).
# any_of() skips names that are absent, so a missing column is left alone
# rather than raising an error.
short_names <- c(
  co2_emissions = "CO2.emissions..metric.tons.per.capita.",
  population_density = "Population.density..people.per.sq..km.of.land.area.",
  imports = "Imports.of.goods.and.services....of.GDP.",
  energy_use = "Energy.use..kg.of.oil.equivalent.per.capita.",
  life_expectancy = "Life.expectancy.at.birth..total..years."
)
data <- rename(data, any_of(short_names))
# Keep only the 1962 observations; later chunks reuse this subset.
data_on_62 <- filter(data, Year == 1962)

# Scatter plot of GDP per capita against CO2 emissions for 1962.
ggplot(data = data_on_62, mapping = aes(x = co2_emissions, y = gdpPercap)) +
  geom_point() +
  labs(
    title = "GDP per capita variation according to CO2 emissions",
    x = "CO2 emissions (metric tons per capita)",
    y = "GDP per capita"
  )

# Pearson correlation (cor.test's default method) between CO2 emissions and
# GDP per capita in 1962, with its significance test.
cor_test_res <- cor.test(
  data_on_62$co2_emissions,
  data_on_62$gdpPercap
)
print(cor_test_res)
## 
##  Pearson's product-moment correlation
## 
## data:  data_on_62$co2_emissions and data_on_62$gdpPercap
## t = 25.269, df = 106, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8934697 0.9489792
## sample estimates:
##       cor 
## 0.9260817
# Find the year in which CO2 emissions and GDP per capita are most strongly
# correlated: compute the Pearson correlation per year on complete rows and
# keep the year(s) with the largest coefficient.
highest_cor <- data %>%
  select(Country.Name, Year, gdpPercap, co2_emissions) %>%
  drop_na() %>%
  group_by(Year) %>%
  summarize(cor = cor(co2_emissions, gdpPercap)) %>%
  # slice_max() replaces the superseded top_n(); like top_n() it keeps ties.
  slice_max(cor, n = 1)
summary(highest_cor)
##       Year           cor        
##  Min.   :1967   Min.   :0.9388  
##  1st Qu.:1967   1st Qu.:0.9388  
##  Median :1967   Median :0.9388  
##  Mean   :1967   Mean   :0.9388  
##  3rd Qu.:1967   3rd Qu.:0.9388  
##  Max.   :1967   Max.   :0.9388
# Interactive 1962 scatter plot: CO2 emissions vs GDP per capita, with points
# coloured by continent and sized by population. Rows missing any of the
# plotted variables are dropped first.
co2_gdp_scatterplot <- ggplot(
  drop_na(
    select(data_on_62, Country.Name, Year, co2_emissions, continent, pop,
           gdpPercap)
  ),
  aes(x = co2_emissions, y = gdpPercap, color = continent, size = pop)
) +
  geom_point(alpha = 0.5) +
  scale_color_discrete(name = "Continent") +
  # Empty title for the size legend; point sizes span 1 to 10.
  scale_size("", range = c(1, 10)) +
  labs(
    title = "GDP per capita variation according to CO2 emissions",
    x = "CO2 emissions (metric tons per capita)",
    y = "GDP per capita"
  )
ggplotly(co2_gdp_scatterplot)
# One-way ANOVA: does mean energy use per capita differ between continents?
# aov() drops rows with missing values itself (its default na.action), so no
# explicit filtering is done here.
energy_continent_anova <- aov(energy_use ~ continent, data = data)
print(summary(energy_continent_anova))
##               Df    Sum Sq   Mean Sq F value Pr(>F)    
## continent      5 8.124e+08 162482656   21.88 <2e-16 ***
## Residuals   1404 1.043e+10   7426183                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1197 observations deleted due to missingness
# Compare imports (% of GDP) between Europe and Asia for years after 1990
# using a two-sample t-test.
relevant_continents <- c("Europe", "Asia")

data_as_eu_after_90 <- data %>%
  select(Country.Name, Year, imports, continent) %>%
  filter(continent %in% relevant_continents, Year > 1990)

# Both samples must come from the renamed `imports` column. The second sample
# used to reference the pre-rename column name, which does not exist in this
# tibble and evaluates to NULL, so t.test() silently fell back to a
# one-sample test of the European values against 0.
# NOTE: any output rendered before this fix shows that one-sample test;
# re-knit the document to refresh it.
europe_asia_imports_t_test <- t.test(
  data_as_eu_after_90$imports[data_as_eu_after_90$continent == "Europe"],
  data_as_eu_after_90$imports[data_as_eu_after_90$continent == "Asia"]
)
europe_asia_imports_t_test
## 
##  One Sample t-test
## 
## data:  data_as_eu_after_90$imports[data_as_eu_after_90$continent == "Europe"]
## t = 26.684, df = 113, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  38.68652 44.89196
## sample estimates:
## mean of x 
##  41.78924
# Rank countries by population density within every year and add the yearly
# ranks up per country: the larger the total, the more consistently dense
# the country has been across all years.
countries <- unique(data$Country.Name[!is.na(data$Country.Name)])

# Within-year ranks (ties share the average rank), summed per country.
# Rows with a missing country, year or density are excluded.
rank_totals <- data %>%
  select(Country.Name, Year, population_density) %>%
  na.omit() %>%
  group_by(Year) %>%
  mutate(year_rank = rank(population_density)) %>%
  group_by(Country.Name) %>%
  summarize(total_rank = sum(year_rank))

# Start from NA so that countries without any usable observation stay NA.
# The earlier version used 0 as a sentinel and replaced it after sorting
# with a mask computed on the unsorted vector, which blanked the wrong
# entries.
pop_density_ranking <- rep(NA_real_, times = length(countries))
names(pop_density_ranking) <- countries
pop_density_ranking[as.character(rank_totals$Country.Name)] <-
  rank_totals$total_rank

# Highest totals first; na.last = TRUE keeps the NA countries at the end
# instead of dropping them.
pop_density_ranking <- sort(
  pop_density_ranking,
  decreasing = TRUE,
  na.last = TRUE
)

head(pop_density_ranking)
##     Macao SAR, China               Monaco Hong Kong SAR, China 
##                 2553                 2553                 2537 
##            Singapore            Gibraltar              Bermuda 
##                 2529                 2518                 2506
# Interactive line chart of population density (log10 scale) over time, one
# line per country. The aesthetic order is kept as-is because ggplotly builds
# its hover tooltip from the mapped aesthetics.
pop_density_plot <- ggplot(
  data,
  aes(
    x = Year,
    y = log10(population_density),
    group = Country.Name,
    color = Country.Name,
    label = Country.Name
  )
) +
  geom_line() +
  labs(
    title = "Population density variation according to year per country",
    x = "Year",
    y = "Population density (log10(people/square km of land area))"
  )
ggplotly(pop_density_plot)
# Change in life expectancy between each country's earliest and latest year
# in the data, as a named vector sorted from largest to smallest increase.
#
# A grouped summary replaces the per-country loop: it avoids re-filtering
# the whole table for every country, does not overwrite a global `years`
# variable, and selects the endpoints by minimum/maximum year rather than by
# row order.
life_expectancy_diff <- data %>%
  filter(!is.na(Country.Name), !is.na(Year)) %>%
  group_by(Country.Name) %>%
  summarize(
    change = life_expectancy[which.max(Year)] -
      life_expectancy[which.min(Year)]
  ) %>%
  # Two-column tibble -> named numeric vector (names = country).
  deframe() %>%
  # sort() drops NA by default, so countries lacking life expectancy at
  # either endpoint are excluded from the ranking.
  sort(decreasing = TRUE)

head(life_expectancy_diff)
##    Maldives      Bhutan Timor-Leste     Tunisia        Oman       Nepal 
##    36.91615    33.19895    31.08515    30.86076    30.82310    30.59963

Including Plots

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.